calc_p <- function(x, n) 2 * (1 - pnorm(mean(x) / (calc_sem(x, n))))

calc_sem <- function(x, n) sd(x) / sqrt(n)

cv_lasso <- function(folds, x, y, ...)
{
  lapply(seq_along(folds$train_sets), function(i) {
    lambda.1se <- cv.glmnet(x = x[folds$train_sets[[i]], ],
      y = y[folds$train_sets[[i]]])$lambda.1se
    model <- glmnet(x = x[folds$train_sets[[i]], ],
      y = y[folds$train_sets[[i]]], lambda = lambda.1se, ...)
    pred <- predict(model, x[folds$test_sets[[i]], ])
  })
}

cv_rf <- function (folds, x, y, ...)
{
  lapply(seq_along(folds$train_sets), function(i) {
    model <- randomForest(x = x[folds$train_sets[[i]], ],
      y = y[folds$train_sets[[i]]], ...)
    pred <- predict(model, x[folds$test_sets[[i]], ])
  })
}

cv_svm <- function(folds, x, y, ...)
{
  lapply(seq_along(folds$train_sets), function(i) {
    model <- svm(x = x[folds$train_sets[[i]], ],
      y = y[folds$train_sets[[i]]], ...)
    pred <- predict(model, x[folds$test_sets[[i]], ])
  })
}

cv_R2 <- function(pred_list, folds, y, ...)
{
  sapply(seq_along(folds$train_sets), function(i) {
    ypred <- pred_list[[i]]
    ytest <- y[folds$test_sets[[i]]]
    cor(ytest, ypred) ^ 2
  })
}
cv_rmse <- function(pred_list, folds, y, ...)
{
  sapply(seq_along(folds$train_sets), function(i) {
    ypred <- pred_list[[i]]
    ytest <- y[folds$test_sets[[i]]]
    sqrt(mean((ytest - ypred)^2))
  })
}
make_description_row <- function(x, varname)
{
  data.table(
    var = varname,
    mean = sprintf("%.02f", mean(x, na.rm = TRUE)),
    sd = sprintf("%.02f", sd(x, na.rm = TRUE)),
    min = ifelse(min(x, na.rm = TRUE) == 1,
      "1", ifelse(min(x, na.rm = TRUE) == 0, "0",
        sprintf("%.02f", min(x, na.rm = TRUE)))),
    max = ifelse(max(x, na.rm = TRUE) == 1,
      "1", ifelse(max(x, na.rm = TRUE) == 0, "0",
        sprintf("%.02f", max(x, na.rm = TRUE)))),
    missing = sum(is.na(x))
  )
}
make_outcome_row <- function(x, varname)
{
  x <- unlist(x)
  x <- c(
    var = varname,
    est = sprintf("%.04f", x[1]),
    se = sprintf("%.04f", x[2]),
    p = ifelse(x[4]<.001,
      "<0.001", sprintf("%0.3f", x[4]))
  )
  cat(x[1], "&$", paste0(x[-1], collapse = "$&$"), "$\\\\\n", sep = "")
}

cv_bigKRLS <- function (folds, x, y) {
  lapply(seq_along(folds$train_sets), function(i) {
    model <- bigKRLS::bigKRLS(
      X = x[folds$train_sets[[i]], ],
      y = y[folds$train_sets[[i]]],
      instructions = FALSE)
    pred <- predict(model, x[folds$test_sets[[i]], ])$predicted
  })
}

setstep <- function(x) {
  eps <- 1e-7
  x + (max(abs(x), 1, na.rm = TRUE) * sqrt(eps)) - x
}

make_plus_dx_dataset <- function(data, dx, type) {
  X0 <- data
  X1 <- data
  xx <- X0[, "dem_spend_adv"] + dx
  X1[, "dem_spend_adv"] <- xx
  X1
}

make_train_and_test_sets <- function(n, k) {
  perm <- sample(n, n)
  n_per_fold <- floor(n / k)
  test_sets <- lapply(1:k, function (i)
    perm[((i - 1) * n_per_fold + 1):(i * n_per_fold)])
  if (n %% k != 0) {
    test_sets[[k]] <- c(test_sets[[k]], perm[(k * n_per_fold + 1):n])
  }
  train_sets <- lapply(test_sets, setdiff, x = 1:n)
  list(test_sets = test_sets, train_sets = train_sets)
}

make_counterfactuals <- function(type_or_quantile, krls_model, data,
  n_boots = 5000) {
  # these are the only safe counterfactuals from the model
  stopifnot(
    type_or_quantile %in% c("zero", "actual") |
      (type_or_quantile >= .05 & type_or_quantile <= .95))
  pred_X <- copy(krls_model$X)
  actual_dem_spending_advantage <-
    pred_X[, "dem_spend_adv"] +
    pred_X[, "bottom_tail_DSA"] +
    pred_X[, "top_tail_DSA"]
  year <-
    1980 * pred_X[, "y80"] + 1982 * pred_X[, "y82"] +
    1984 * pred_X[, "y84"] + 1986 * pred_X[, "y86"] +
    1988 * pred_X[, "y88"] + 1990 * pred_X[, "y90"] +
    1992 * pred_X[, "y92"] + 1994 * pred_X[, "y94"] +
    1996 * pred_X[, "y96"] + 1998 * pred_X[, "y98"] +
    2000 * pred_X[, "y00"] + 2002 * pred_X[, "y02"] +
    2004 * pred_X[, "y04"] + 2006 * pred_X[, "y06"] +
    2008 * pred_X[, "y08"] + 2010 * pred_X[, "y10"] +
    2012 * pred_X[, "y12"] + 2014 * pred_X[, "y14"] +
    2016 * pred_X[, "y16"] + 2018 * pred_X[, "y18"]
  if (!is.na(as.numeric(type_or_quantile))) {
    counterfactual_dem_spending_advantage <-
      quantile(actual_dem_spending_advantage, type_or_quantile)
  }
  if (type_or_quantile == "actual") {
  } else if (type_or_quantile == "zero") {
    pred_X[, "dem_spend_adv"] <- 0
    pred_X[, "log_total_spending"] <- min(pred_X[, "log_total_spending"])
  } else if (counterfactual_dem_spending_advantage > 0) {
    rows_to_change <- which(
      pred_X[, "bottom_tail"] + pred_X[, "top_tail"] == 0 &
        actual_dem_spending_advantage < counterfactual_dem_spending_advantage)
    pred_X[rows_to_change, "dem_spend_adv"] <-
      counterfactual_dem_spending_advantage
    pred_X[rows_to_change, "log_total_spending"] <- log10(
      2 * data[rows_to_change, real_rep_expenditure_w_outside] +
        abs(counterfactual_dem_spending_advantage))
  } else if (counterfactual_dem_spending_advantage < 0) {
    # include .5 here because median obs has R spend more than D; i.e. DSA < 0
    counterfactual_dem_spending_advantage <-
      quantile(actual_dem_spending_advantage, type_or_quantile)
    rows_to_change <- which(
      pred_X[, "bottom_tail"] + pred_X[, "top_tail"] == 0 &
        actual_dem_spending_advantage > counterfactual_dem_spending_advantage)
    pred_X[rows_to_change, "dem_spend_adv"] <-
      counterfactual_dem_spending_advantage
    pred_X[rows_to_change, "log_total_spending"] <- log10(
      2 * data[rows_to_change, real_dem_expenditure_w_outside] +
        abs(counterfactual_dem_spending_advantage))
  }
  pred <- predict(krls_model, newdata = pred_X, se.pred = TRUE)
  sims <- MASS::mvrnorm(n_boots, pred$predicted,
    as.matrix(pred$vcov.est.pred))
  counterfactuals <- CJ(index = 1:nrow(pred_X), boot = 1:n_boots)
  counterfactuals[, `:=`(
    counterfactual_dem_spend_adv =
      (pred_X[, "dem_spend_adv"] +
          pred_X[, "bottom_tail_DSA"] +
          pred_X[, "top_tail_DSA"])[index],
    actual_dem_spending_advantage = actual_dem_spending_advantage[index],
    boot_y = as.vector(sims),
    year = year[index],
    type_or_quantile = type_or_quantile,
    nonoutlier = (1 - pred_X[, "bottom_tail"] - pred_X[, "top_tail"])[index]
  )]
  counterfactuals[, `:=`(
    party_of_winner = ifelse(boot_y > .5, "D", "R")
  )]
  counterfactuals[, .(
    n_dems = sum(boot_y > .5),
    n_reps = sum(boot_y < .5)), .(
      type_or_quantile, year, boot)]
}